*** Start from a clean session: close any log left open by an earlier run,
*** wipe data and programs from memory, and switch off output paging so the
*** script runs unattended.
capture log close
clear all
program drop _all
set more off

*** Record everything that follows in the log file (overwritten on each run)
log using "..\Programs/Preparing Merchandise Exports Numbers.log", replace

*** Preparing Merchandise Exports Numbers.do
*** 8/1/2016
*** Brina Seidel

*************************************
*** Define a program to clean country names
*************************************
program define CLEAN_COUNTRY_NAMES
	*** Harmonize country names so that different sources can be merged.
	*** Usage: CLEAN_COUNTRY_NAMES varname
	***   varname -- string variable holding country names; it is modified in place.
	*** Every rule tests the ORIGINAL (trimmed) name, never the partially
	*** cleaned one, so when several rules match the same name the LAST
	*** matching rule wins. Keep that in mind before reordering rules.
	*** A helper variable called <varname>_temp is created and dropped again,
	*** so no variable of that name may exist when the program is called.
	replace `1' = trim(`1')
	gen `1'_temp = `1'
	quietly replace `1'_temp = "Yemen, Rep." if strpos(`1', "Yemen") > 0
	quietly replace `1'_temp = "Vietnam" if strpos(`1', "Viet") > 0
	quietly replace `1'_temp = "Macao SAR, China" if strpos(`1', "Macao") > 0
	quietly replace `1'_temp = "Hong Kong SAR, China" if strpos(`1', "Hong") > 0
	quietly replace `1'_temp = "China" if strpos(`1', "China") > 0 & strpos(`1', "Mainland") > 0
	quietly replace `1'_temp = "Lao PDR" if strpos(`1', "Lao") > 0
	*** Korea: names mentioning "Dem" or "North" go to the DPRK; every other
	*** Korea name (or one mentioning "South") goes to the Republic of Korea
	quietly replace `1'_temp = "Korea, Dem. People’s Rep." if strpos(`1', "Korea") > 0 & (strpos(`1', "Dem") > 0 | strpos(`1', "North") > 0)
	quietly replace `1'_temp = "Korea, Rep." if strpos(`1', "Korea") > 0 & ((strpos(`1', "Dem") == 0  & strpos(`1', "North") == 0) | strpos(`1', "South") > 0)
	quietly replace `1'_temp = "Hong Kong SAR, China" if strpos(`1', "Hong Kong") > 0
	quietly replace `1'_temp = "Gambia, The" if strpos(`1', "Gambia") > 0
	*** Congo: "Dem" in the name (or the old name Zaire) marks the Democratic Republic
	quietly replace `1'_temp = "Congo, Rep." if strpos(`1', "Congo") > 0 & strpos(`1', "Dem") == 0
	quietly replace `1'_temp = "Congo, Dem. Rep." if strpos(`1', "Congo") > 0 & strpos(`1', "Dem") > 0
	quietly replace `1'_temp = "Congo, Dem. Rep." if strpos(`1', "Zaire") > 0
	quietly replace `1'_temp = "Bahamas, The" if strpos(`1', "Bahamas") > 0
	quietly replace `1'_temp = "Egypt, Arab Rep." if strpos(`1', "Egypt") > 0
	quietly replace `1'_temp = "Cote d'Ivoire" if strpos(`1', "Ivoire") > 0
	quietly replace `1'_temp = "Bolivia" if strpos(`1', "Bolivia") > 0
	quietly replace `1'_temp = "Tanzania" if strpos(`1', "Tanzania") > 0
	quietly replace `1'_temp = "Afghanistan" if strpos(`1', "Afghanistan") > 0
	*** Matching on "ncipe" avoids the accented characters in Sao Tome and Principe
	quietly replace `1'_temp = "Sao Tome and Principe" if strpos(`1', "ncipe") > 0
	*** This rule also covers the exact name "Iran, Islamic Republic of"
	quietly replace `1'_temp = "Iran, Islamic Rep." if strpos(`1', "Iran") > 0
	quietly replace `1'_temp = "St. Vincent and the Grenadines" if strpos(`1', "St. Vincent")> 0 | strpos(`1', "Grenadines")> 0
	quietly replace `1'_temp = "Venezuela, RB" if strpos(`1', "Venezuela") > 0
	quietly replace `1'_temp = "Central African Republic" if strpos(`1', "Central African") > 0
	quietly replace `1'_temp = "Kyrgyz Republic" if strpos(`1', "Kyrgyz") > 0
	quietly replace `1'_temp = "Macedonia, FYR" if strpos(`1', "Macedonia") > 0
	*** NOTE(review): a former rule renamed any name containing "Falkand" (sic)
	*** to "Maldives". The Falkland Islands and the Maldives are different
	*** territories, so that rule has been removed instead of having its
	*** spelling corrected. Add an explicit rule here if a source needs one.
	quietly replace `1'_temp = "Antigua and Barbuda" if strpos(`1', "Antigua") > 0
	quietly replace `1'_temp = "Cabo Verde" if strpos(`1', "Verde") > 0
	quietly replace `1'_temp = "Comoros" if strpos(`1', "Comoro") > 0
	quietly replace `1'_temp = "Guinea-Bissau" if strpos(`1', "Guinea") > 0 & strpos(`1', "Bissau") > 0
	quietly replace `1'_temp = "Haiti" if `1' == "Haïti"
	quietly replace `1'_temp = "Myanmar" if strpos(`1', "Burma") > 0
	quietly replace `1'_temp = "Syrian Arab Republic" if strpos(`1', "Syria") > 0
	quietly replace `1'_temp = "Brunei Darussalam" if strpos(`1', "Brunei") > 0
	quietly replace `1'_temp = "St. Kitts and Nevis" if strpos(`1', "Kitts") >  0 & strpos(`1', "Nevis") > 0
	quietly replace `1'_temp = "Russian Federation" if strpos(`1', "Russia") > 0
	quietly replace `1'_temp = "Micronesia, Fed. Sts." if strpos(`1', "Micronesia") > 0
	quietly replace `1'_temp = "Timor-Leste" if strpos(`1', "Timor") > 0
	quietly replace `1'_temp = "United States" if `1' == "United States of America"

	*** Show country names that we updated
	*** (skipped when nothing changed, so that contract and list are never
	*** handed an empty dataset)
	quietly count if `1'_temp != `1'
	if r(N) > 0 {
		preserve
		keep if `1'_temp != `1'
		contract `1'_temp `1'
		list `1'_temp `1', ab(20) sep(100)
		restore
	}
	replace `1' = `1'_temp
	drop `1'_temp
end

*************************************************************
*************************************************************
*** Merchandise exports as % GDP, 1870-1998 -- Maddison 
*************************************************************
*************************************************************

*************************************
*** Read in the data
*************************************
*** Load the Maddison sheet; the first row supplies the variable names
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("Maddison Merchandise Exports") firstrow clear
describe, fullnames

*** Discard the rows and the spill-over columns (C through F) that the
*** import picked up but that hold no data
keep if merch_exp_pct != .
drop C-F

*** Eyeball the values
*twoway line merch_exp_pct year
tabulate year, missing

*** Stash the 1870-1998 series for the final append
tempfile merch_exp1870_1998
save `merch_exp1870_1998'.dta, replace

*************************************************************
*************************************************************
*** Merchandise exports, 1960-2015 -- WTO
*************************************************************
*************************************************************

*************************************
*** Read in the data
*************************************
*** Download the merchandise exports indicator from the World Bank API
wbopendata, indicator(TX.VAL.MRCH.CD.WT) nometadata clear

*** Keep individual countries only: drop regional aggregates and rows with no region
drop if inlist(region, "Aggregates", "")

*** Go from one column per year (yr1960, yr1961, ...) to one row per country-year
rename yr* merch_exp*
reshape long merch_exp, i(countryname) j(year)

*************************************
*** Clean missing data:
*** 1. For cases where the nonmissing data points are > 5 years apart, we will drop the country from the sample
*** 2. For cases where the nonmissing data points are <= 5 years apart, we will fill in the missing values by linear interpolation between the neighboring data points
*** 3. For cases where the missing data points are at the beginning of the series, we will extend the earliest ratio of merchandise exports to GDP backwards
*************************************

*** 1. For cases where the nonmissing data points are > 5 years apart, we will drop the country from the sample

*** Mark countries that are missing data because the countries did not exist yet -- these are not truly "missing"
*** ".a" will refer to values that are truly missing, while ".b" will refer to values that are missing because the country did not exist
*** Build a country -> start_year lookup from the COW sheet without disturbing
*** the export data currently in memory, then attach it to every country-year
preserve
import excel using "..\Input Data\Globalization Input Data.xlsx", clear sheet("COW Year of Country Creation") firstrow
keep countryname start_year
tempfile start_year
save `start_year'.dta, replace
restore
*** assert(1 3): every row of the lookup must match a country in the export data;
*** export countries without a lookup entry are allowed and get a missing start_year
merge m:1 countryname using `start_year'.dta, assert(1 3) nogen norep
*** Recode missing values: first everything to .a (truly missing), then the
*** years before a known start_year to .b (country did not exist yet)
replace merch_exp = .a if merch_exp == .
replace merch_exp = .b if year < start_year & start_year < .

*** Drop countries for which the nonmissing data points are > 5 years apart
sort countryname year
*** For every row, find how many rows ahead the next row of the SAME country
*** with a value other than .a sits (0 if there is none within 55 rows).
*** The first hit is kept because later passes only touch rows that are still 0.
*** NOTE(review): this counts rows, which equals years only if each country has
*** exactly one row per consecutive year -- confirm against the reshape above.
*** NOTE(review): .b counts as "not .a" here, and a run of .a at the START of a
*** series is measured like any other gap -- confirm that this is intended
*** given step 3 below.
gen years_to_nonmiss = 0
forvalues i = 1/55 {
	quietly replace years_to_nonmiss = `i' if years_to_nonmiss == 0 &  merch_exp[_n+`i'] != .a & countryname[_n+`i'] == countryname
}
*** The country's largest look-ahead distance is its biggest gap
bys countryname: egen biggest_gap = max(years_to_nonmiss)
*** Also mark countries with no data at all to drop
gen any_data_temp = (merch_exp < .)
bys countryname: egen any_data = max(any_data_temp)
*** A missing biggest_gap compares as larger than 5, so these countries are
*** caught by the drop below
replace biggest_gap = . if any_data == 0
*** Show the countries we are dropping
tab countryname if biggest_gap > 5
drop if biggest_gap > 5

*** 2. For cases where the nonmissing data points are <= 5 years apart, we will interpolate the missing values
*** (ipolate interpolates linearly, in levels, between the two neighboring data points)

*** Interpolate WITHIN each country. Without by(countryname), ipolate treats the
*** whole dataset as one series and, because every year is repeated across
*** countries, returns the cross-country mean for each year.
*** Values marked .b (country did not exist yet) are excluded from the interpolation.
ipolate merch_exp year if merch_exp < . | merch_exp == .a, gen(merch_exp_temp) by(countryname)
*** Fill only the truly missing values (.a) for which an interpolated value exists.
*** Values that cannot be interpolated (for example at the start of a series)
*** keep their .a code, which step 3 relies on to find them.
replace merch_exp = merch_exp_temp if merch_exp == .a & merch_exp_temp < .
drop merch_exp_temp

*** 3. For cases where the missing data points are at the beginning of the series, we will extend the earliest ratio of merchandise exports to GDP backwards

*** First, mark the earliest year of data
*** Find each country's first year with export data
gen first_obs_year = year if merch_exp < .
bysort countryname: egen first_data = min(first_obs_year)
drop first_obs_year

*** Fetch current-dollar GDP in long form (one row per country-year) and park it in a tempfile
preserve
wbopendata, indicator(NY.GDP.MKTP.CD) nometadata clear
drop if inlist(region, "Aggregates", "")
rename yr* gdp*
reshape long gdp, i(countryname) j(year)
keep countryname year gdp
tempfile gdp
save `gdp'.dta, replace
restore

*** Every export country-year must have a GDP row; GDP rows for countries that
*** are no longer in the sample are discarded
merge 1:1 countryname year using `gdp'.dta, assert(using match) keep(match) nogenerate noreport

*** Take the exports/GDP ratio in the first year with data, spread it over the
*** country's rows, and apply it to GDP in the earlier years that are still marked .a
gen ratio_at_first = merch_exp/gdp if year == first_data
bysort countryname: egen first_ratio = max(ratio_at_first)
drop ratio_at_first
replace merch_exp = first_ratio*gdp if year < first_data & merch_exp == .a

*************************************
*** Save a list of countries and years with nonmissing data
*************************************
*** Country-years with export data, 1960-1989 (paired later with Maddison GDP)
tempfile exp_nonmiss1960_1989
preserve
keep if inrange(year, 1960, 1989) & merch_exp < .
keep countryname year
save `exp_nonmiss1960_1989'.dta, replace
restore

*** Country-years with export data, 1990 onwards (paired later with World Bank GDP)
tempfile exp_nonmiss1990_2015
preserve
keep if year >= 1990 & merch_exp < .
keep countryname year
save `exp_nonmiss1990_2015'.dta, replace
restore

*************************************
*** Compare developing world GDP to the GDP of countries in our sample in 2015
*************************************
*** Step A: 2015 current-dollar GDP for the developing regions, excluding
*** Japan, Australia and New Zealand
preserve
wbopendata, indicator(NY.GDP.MKTP.CD) year(2015) nometadata clear
keep if inlist(region, "Middle East & North Africa (all income levels)", "Latin America & Caribbean (all income levels)", "East Asia & Pacific (all income levels)", "South Asia", "Sub-Saharan Africa (all income levels)")
drop if inlist(countryname, "Japan", "Australia", "New Zealand")
rename yr* gdp_current*
reshape long gdp_current, i(countryname) j(year)
tempfile gdp_current
save `gdp_current'.dta, replace
restore

*** Step B: compare against the countries in our sample that have 2015 export
*** data. This is a diagnostic only; the data in memory is restored afterwards.
preserve
keep if merch_exp < . & year == 2015
merge 1:1 countryname year using `gdp_current'.dta
*** GDP counts towards the sample total only for countries present on both sides
gen gdp_current_sample = gdp_current if _merge == 3
collapse (sum) gdp*, by(year)
format gdp* %16.0fc
gen pct_excluded = (gdp_current - gdp_current_sample)/gdp_current * 100
list year gdp_current gdp_current_sample pct_excluded
restore

*************************************
*** Add up global total 
*************************************
*** Sum exports across all remaining countries, giving one row per year
tempfile merch_exp1960_2015
collapse (sum) merch_exp, by(year)

*** Store the yearly totals for the later merge with GDP
keep year merch_exp
save `merch_exp1960_2015'.dta, replace

*************************************************************
*************************************************************
*** GDP in GK Prices, 1960-1989 -- Maddison
*************************************************************
*************************************************************

*************************************
*** Read in the data
*************************************
import excel using "..\Input Data\Globalization Input Data.xlsx", sheet("Maddison GDP") cellrange(A3:GM199) firstrow clear

*** Name the columns
*** Column A holds the country name. Each remaining column is renamed to
*** gdp<label>, using its variable label as the suffix; columns with no label
*** must be entirely empty and are dropped.
rename A countryname
foreach col of varlist B-GM {
	local yr : variable label `col'
	if "`yr'" != "" {
		rename `col' gdp`yr'
	}
	else {
		assert `col' == .
		drop `col'
	}
}

*** Go to one row per country-year
drop if countryname == ""
reshape long gdp, i(countryname) j(year)

*** The sheet is in millions of 1990 GK dollars; convert to dollars
replace gdp = gdp * 1000000

*** Restrict to the years for which the Maddison GDP is used
keep if inrange(year, 1960, 1989)

*************************************
*** Remove GDP for countries/years that are missing merchandise export data (because we are using GDP in the denominator only for countries with nonmissing merchandise export data)
*** (However, Maddison's data is not completely disaggregated -- he includes categories like "Total 24 small Caribbean countries" that we cannot disaggregate.
*** Therefore, we can't actually drop GDP for countries within those small categories. These countries amount to a tiny percentage of global GDP.)
*************************************

*** Prepare data for merging
*** Bring the Maddison country names in line with the World Bank names
CLEAN_COUNTRY_NAMES countryname
replace countryname = "Ethiopia" if countryname == "Eritrea and Ethiopia"

*** Drop the "total" rows, except the small-country totals and the USSR total
drop if strpos(lower(countryname), "total") > 0 & strpos(lower(countryname), "small") == 0 & strpos(countryname, "USSR") == 0
*** Drop rows whose name starts with a digit
drop if !missing(real(substr(countryname, 1, 1)))

*** Flag which country-years have export data.
*** Countries with _merge == 2 are mostly small countries that are included in
*** Maddison's small country aggregates, and therefore not listed individually;
*** they are not kept.
merge 1:1 countryname year using `exp_nonmiss1960_1989'.dta, keep(master match)

*** Blank out GDP for country-years without export data, i.e. everything that
*** did not match the list of nonmissing country-years, except the small
*** country aggregates
replace gdp = . if strpos(lower(countryname), "small") == 0 & _merge == 1
drop _merge

*************************************
*** Add up the global total
*************************************
collapse (sum) gdp, by(year)
format gdp* %15.0fc

*** Store the yearly totals
keep year gdp
tempfile gdp1960_1989
save `gdp1960_1989'.dta, replace

*************************************************************
*************************************************************
*** GDP in international prices, 1990-2015 -- World Bank
*************************************************************
*************************************************************

*************************************
*** Read in the data 
*************************************
*** Download the PPP GDP indicator from the World Bank API
wbopendata, indicator(NY.GDP.MKTP.PP.CD) nometadata clear

*** Individual countries only: no regional aggregates, no rows without a region
drop if inlist(region, "Aggregates", "")

*** One row per country-year
rename yr* gdp*
reshape long gdp, i(countryname) j(year)

*** Only 1990 onwards is taken from this source
drop if year < 1990

*************************************
*** Remove GDP for countries/years that are missing merchandise export data
*** (because we are using GDP in the denominator only for countries with nonmissing merchandise export data)
*************************************
*** assert(1 3): every country-year in the export list must also exist here
merge 1:1 countryname year using `exp_nonmiss1990_2015'.dta, assert(1 3)
replace gdp = . if _merge == 1
drop _merge

*************************************
*** Add up global total
*************************************
collapse (sum) gdp, by(year)

*** Store the yearly totals
tempfile gdp1990_2015
save `gdp1990_2015'.dta, replace

*************************************
*** Combine data on merchandise exports and GDP for 1960-2015
*************************************

*** Read in merchandise export data
*** Read in the yearly global merchandise export totals
use `merch_exp1960_2015'.dta, clear

*** For data between 1960 and 1989, use US CPI data to convert to 1990 prices
**** (so that we can divide by Maddison's GDP data, which is all in 1990 prices)
preserve
wbopendata, clear indicator(FP.CPI.TOTL) country(USA) nometadata
rename yr* cpi*
reshape long cpi, i(countryname) j(year)
keep year cpi
keep if year >= 1960 & year <= 1990
*** Pick up the 1990 CPI by year instead of by row position, and stop if it is
*** absent or missing; otherwise every deflated value would silently become missing
quietly summarize cpi if year == 1990, meanonly
assert r(N) == 1
local cpi_1990 = r(mean)
*** 1990 itself is not deflated, so it is left out of the merge file
drop if year == 1990
tempfile cpis
save `cpis'.dta, replace
restore
merge 1:1 year using `cpis'.dta, nogen 
replace merch_exp = merch_exp * `cpi_1990'/cpi if year >=1960 & year <=1989

*** Merge in GDP data
*** The first merge brings in gdp for 1960-1989. The second uses the update
*** option so that the rows whose gdp is still missing (1990 onwards) are
*** filled from the World Bank totals without touching the earlier years.
merge 1:1 year using `gdp1960_1989'.dta, assert(1 3) nogen 
merge 1:1 year using `gdp1990_2015'.dta, assert(1 3 4) nogen update 
*** Every year must now have a GDP denominator
assert gdp < .

*** Calculate merchandise exports as % GDP
gen merch_exp_pct = merch_exp/gdp * 100 

*** Save tempfile (the local name is reused, so it now points to a new file
*** holding the percentages rather than the export totals)
keep year merch_exp_pct
tempfile merch_exp1960_2015
save `merch_exp1960_2015'.dta, replace

*************************************************************
*************************************************************
*** Combine merchandise export data for 1870-2015
*************************************************************
*************************************************************
*** Stack the two series; src records which file each row came from
clear all
append using `merch_exp1870_1998'.dta `merch_exp1960_2015'.dta, generate(src)
label define src 1 "Maddison" 2 "World Bank"
label values src src

*** For years covered by both sources, print the two values side by side
duplicates tag year, generate(dup)
sort year
format merch_exp_pct %10.1fc
list if dup, sepby(year) abbreviate(20)

*** Where both sources have a value, the World Bank one is kept
drop if src == 1 & dup > 0
drop dup src

*************************************************************
*************************************************************
*** Save the data
*************************************************************
*************************************************************
describe, fullnames
save "Merchandise Exports - Percent of World GDP.dta", replace

log close

